# -*- coding: utf-8 -*-
"""Artigo RBGN final.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1V-fnckNi7sKVPQ9n_AgREYmIBnuvfdVC

# Preparing Code
"""

pip install --upgrade xlrd

import pandas as pd
import numpy as np
import datetime as dt
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import gspread

from scipy import stats
from google.colab import auth
from google.auth import default
from oauth2client.service_account import ServiceAccountCredentials
from statsmodels.stats.outliers_influence import variance_inflation_factor
sns.set()

"""# Importing Data"""

# Autenticando e criando o cliente da API do Google
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Função para carregar dados das ações de uma planilha específica
def load_spreadsheet(sheet_name, header_line):
    try:
        spreadsheet = gc.open(sheet_name)
        worksheet = spreadsheet.sheet1  # ou use spreadsheet.get_worksheet(0)
        data = pd.DataFrame(worksheet.get_all_values())
        header = data.iloc[header_line]
        data = data[header_line + 1:]
        data.columns = header
        return data
    except Exception as e:
        print(f"Erro ao carregar planilha {sheet_name}: {e}")
        return None

# Função para limpar e converter dados
def clean_convert_data(data):
    if isinstance(data, str):
        pass
    try:
        data.columns = [col.split()[-1] if col != 'Data' else 'Data' for col in data.columns]
        data.replace('-', pd.NA, inplace=True)
        data['Data'] = pd.to_datetime(data['Data'], dayfirst=True, errors='coerce')
        for col in data.columns[1:]:
            data[col] = pd.to_numeric(data[col].apply(lambda x: x.replace(',', '.') if pd.notna(x) else pd.NA), errors='coerce')
        return data
    except Exception as e:
        print(f"Erro ao limpar e converter dados: {e}")
        return None

# Importar dados BSD
df = load_spreadsheet('Dados BSD', 0)

if df is not None:
    df = clean_convert_data(df)

    # Identificar colunas numéricas e excluir a coluna 'Data'
    cols_numericas = df.select_dtypes(include=[np.number]).columns

    # Converter coluna Data no tipo datatime e em index
    df['Data'] = pd.to_datetime(df['Data'], dayfirst=True, errors='coerce')
    df.set_index('Data', inplace=True)

print([df.head(5), df.tail(5)])

# Importar e preparar dados da planilha de fatores
market_factors = load_spreadsheet('Market Factors NEFIN', 0)

if market_factors is not None:
    market_factors = clean_convert_data(market_factors)

    # Identificar colunas numéricas e excluir a coluna 'Data'
    cols_numericas = market_factors.select_dtypes(include=[np.number]).columns

    # Converter coluna Data no tipo datatime e em index
    market_factors['Data'] = pd.to_datetime(market_factors['Data'], dayfirst=True, errors='coerce')
    market_factors.set_index('Data', inplace=True)

print([market_factors.head(5), market_factors.tail(5)])

# Renomeando as colunas
renomear_colunas = {
    "mkt": "MKT",
    "rf": "RF",
    "mkt-rf": "MKT-RF",
    "smb": "SMB",
    "hml": "HML",
    "mom": "WML",
    "illiq": "IML"
}

market_factors.rename(columns=renomear_colunas, inplace=True)

# Importar dados BSD
dfd = load_spreadsheet('Dados diarios', 0)

if dfd is not None:
    dfd = clean_convert_data(dfd)

    # Identificar colunas numéricas e excluir a coluna 'Data'
    cols_numericas = dfd.select_dtypes(include=[np.number]).columns

    # Converter coluna Data no tipo datatime e em index
    dfd['Data'] = pd.to_datetime(dfd['Data'], dayfirst=True, errors='coerce')
    dfd.set_index('Data', inplace=True)

print([dfd.head(5), dfd.tail(5)])

df2 = pd.concat([dfd, market_factors], axis=1, join='inner')
df2 = df2.dropna()
print(df2)

"""#Correlations"""

# Criar variáveis de prêmio
df2['IBOV_premium'] = df2['IBOV'] - df2['RF']
df2['IBRX_premium'] = df2['IBRx100'] - df2['RF']
df2['IDIV_premium'] = df2['IDIV'] - df2['RF']
df2['IGC_premium'] = df2['IGC'] - df2['RF']
df2['BSD_premium'] = df2['Long'] - df2['RF']
df2['10_premium'] = df2['Rank10'] - df2['RF']
df2['15_premium'] = df2['Rank15'] - df2['RF']
df2['20_premium'] = df2['Rank20'] - df2['RF']

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20, 8))

# Definindo diferentes estilos de linha
linestyles = ['-', '--', '-.', ':', (0, (3, 1, 1, 1)), (0, (5, 10))]
colors = ['black', 'gray', 'darkgray', 'lightgray', 'gainsboro', 'silver']  # Tons de cinza

# Calculando os retornos acumulados e os valores finais
ranking10_cumprod = (1 + df2['Rank10']).cumprod() - 1
ranking15_cumprod = (1 + df2['Rank15']).cumprod() - 1
ranking20_cumprod = (1 + df2['Rank20']).cumprod() - 1
long_cumprod = (1 + df2['Long']).cumprod() - 1
ls_cumprod = (1 + df2['LS']).cumprod() - 1
ibov_cumprod = (1 + df2['IBOV']).cumprod() - 1
idiv_cumprod = (1 + df2['IDIV']).cumprod() - 1

# Obtendo os valores finais
ranking10_final = ranking10_cumprod.iloc[-1]
ranking15_final = ranking15_cumprod.iloc[-1]
ranking20_final = ranking20_cumprod.iloc[-1]
long_final = long_cumprod.iloc[-1]
ls_final = ls_cumprod.iloc[-1]
ibov_final = ibov_cumprod.iloc[-1]
idiv_final = idiv_cumprod.iloc[-1]

# Plotando os dados com os rótulos atualizados
plt.plot(ranking10_cumprod, label=f'Rank10 ({ranking10_final:.2f})', linestyle=linestyles[1], color=colors[0])
plt.plot(ranking15_cumprod, label=f'Rank15 ({ranking15_final:.2f})', linestyle=linestyles[1], color=colors[1])
plt.plot(ranking20_cumprod, label=f'Rank20 ({ranking20_final:.2f})', linestyle=linestyles[2], color=colors[2])
plt.plot(long_cumprod, label=f'Long_BSD ({long_final:.2f})', linestyle=linestyles[3], color=colors[0])
plt.plot(ls_cumprod, label=f'RankBSD ({ls_final:.2f})', linestyle=linestyles[4], color=colors[1])
plt.plot(ibov_cumprod, label=f'IBOV ({ibov_final:.2f})', linestyle=linestyles[0], color=colors[0])
plt.plot(idiv_cumprod, label=f'IDIV ({idiv_final:.2f})', linestyle=linestyles[0], color=colors[5])

plt.title('Cumulative Return')
plt.xlabel('Year')
plt.ylabel('Portfolio Return')

# Configuração dos rótulos dos eixos y em percentagem
plt.gca().set_yticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_yticks()])

plt.legend(loc='upper left')
plt.grid(True)

plt.show()

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20, 8))
# ax.set_facecolor('white')  # Define a cor de fundo do eixo para branco

# Definindo diferentes estilos de linha
linestyles = ['-', '--', '-.', ':', (0, (3, 1, 1, 1)), (0, (5, 10))]
colors = ['black', 'gray', 'darkgray', 'lightgray', 'gainsboro', 'silver']  # Tons de cinza

plt.plot(((1+df_m['Ranking10']).cumprod()-1), label='Rank10', linestyle=linestyles[1], color=colors[0])
plt.plot(((1+df_m['Ranking15']).cumprod()-1), label='Rank15', linestyle=linestyles[1], color=colors[1])
plt.plot(((1+df_m['Ranking20']).cumprod()-1), label='Rank20', linestyle=linestyles[2], color=colors[2])
plt.plot(((1+df_m['Long']).cumprod()-1), label='Long_BSD', linestyle=linestyles[3], color=colors[0])
plt.plot(((1+df_m['LS']).cumprod()-1), label='LS_BSD', linestyle=linestyles[4], color=colors[1])
plt.plot(((1+df_m['IBOV']).cumprod()-1), label='IBOV', linestyle=linestyles[0], color=colors[0])
plt.plot(((1+df_m['IDIV']).cumprod()-1), label='IDIV', linestyle=linestyles[0], color=colors[5])  # Repetindo a cor e estilo para IDIV

plt.title('Cumulative Return')
plt.xlabel('Year')
plt.ylabel('Portfolio Return')

# Configuração dos rótulos dos eixos y em percentagem
plt.gca().set_yticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_yticks()])

plt.legend(loc='upper left')
plt.grid(True)

plt.show()

fig, axes = plt.subplots(figsize=(20,8))

# Definindo diferentes estilos de linha
linestyles = ['-', '--', '-.', ':', (0, (3, 1, 1, 1)), (0, (5, 10))]
colors = ['black', 'gray', 'darkgray', 'lightgray', 'gainsboro', 'silver']  # Tons de cinza

plt.plot(((1+df_m['Ranking10']).cumprod()-1), label = 'Rank_10', linestyle=linestyles[1], color=colors[0])
plt.plot(((1+df_m['Ranking15']).cumprod()-1), label = 'Rank_15', linestyle=linestyles[1], color=colors[1])
plt.plot(((1+df_m['Ranking20']).cumprod()-1), label = 'Rank_20', linestyle=linestyles[2], color=colors[2])
plt.plot(((1+df_m['Long']).cumprod()-1), label = 'LongBSD', linestyle=linestyles[3], color=colors[0])
plt.plot(((1+df_m['MKT']).cumprod()-1), label = 'MKT', linestyle=linestyles[0], color=colors[0])
plt.plot(((1+df_m['SMB']).cumprod()-1), label = 'SMB', linestyle=linestyles[4], color=colors[1])
plt.plot(((1+df_m['HML']).cumprod()-1), label = 'HML', linestyle=linestyles[3], color=colors[2])
plt.plot(((1+df_m['WML']).cumprod()-1), label = 'WML', linestyle=linestyles[0], color=colors[5])
plt.plot(((1+df_m['IML']).cumprod()-1), label = 'IML', linestyle=linestyles[4], color=colors[1])

plt.title('Cumulative Return')
plt.xlabel('Year')
plt.ylabel('Portfolio Return')
plt.gca().set_yticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_yticks()])
plt.legend(loc='upper left')
plt.grid(True)

# Lista das variáveis de retorno
retorno_variaveis = ['Rank10', 'Rank15', 'Rank20', 'Long', 'IBOV', 'IDIV', 'IBRx100', 'IGC', 'MKT', 'SMB', 'HML', 'WML', 'IML']

# DataFrame para armazenar os retornos acumulados
df2_cumprod = pd.DataFrame(index=df2.index)

# Calculando o retorno acumulado para cada variável
for var in retorno_variaveis:
    df2_cumprod[var] = (1 + df2[var]).cumprod() - 1

# Verificando as primeiras linhas do DataFrame resultante
print(df2_cumprod.tail())

df_correl = df2[[ 'IBOV', 'IDIV', 'Rank10', 'Rank15', 'Rank20', 'Long', 'MKT-RF', 'SMB', 'HML', 'WML', 'IML']]

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Função para calcular as correlações e os valores p
def calculate_pvalues(df):
    df = df.dropna()._get_numeric_data()
    pvals = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    corrs = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    for r in df.columns:
        for c in df.columns:
            if r == c:
                pvals.at[r, c] = np.nan
                corrs.at[r, c] = np.nan
            else:
                corr, p = pearsonr(df[r], df[c])
                pvals.at[r, c] = p
                corrs.at[r, c] = corr
    return corrs.astype(float), pvals.astype(float)

# Calculando correlações e p-values
correlations, p_values = calculate_pvalues(df_correl)

# Certifique-se de que todos os valores são float
correlations = correlations.astype(float)
p_values = p_values.astype(float)

# Criar máscara para mostrar apenas a metade inferior da matriz de correlações
mask = np.triu(np.ones_like(correlations, dtype=bool))

# Configuração do gráfico
plt.figure(figsize=(12, 6))
ax = sns.heatmap(correlations, mask=mask, cmap='coolwarm', cbar=True,
                 annot=False, fmt=".2f",
                 xticklabels=correlations.columns, yticklabels=correlations.columns)

# Função para adicionar anotações personalizadas
def add_annotations(ax):
    for i, j in zip(*np.tril_indices_from(correlations, k=-1)):
        # Correlação e valor p
        corr_value = correlations.iloc[i, j]
        p_value = p_values.iloc[i, j]
        text = f"{corr_value:.2f}\n(p={p_value:.3f})"
        ax.text(j + 0.5, i + 0.5, text, ha='center', va='center', color='black', fontsize=9)

# Adicionando anotações personalizadas
add_annotations(ax)

plt.title('Matriz de Correlação com Valores P')
plt.show()

"""# Quality Statistics"""

# Retorno anualizado

df_m['R10_Ret'] = (1+df['Ranking10']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['R15_Ret'] = (1+df['Ranking15']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['R20_Ret'] = (1+df['Ranking20']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['Long_Ret'] = (1+df['Long']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IBOV_Ret'] = (1+df['IBOV']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IDIV_Ret'] = (1+df['IDIV']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IBRx100_Ret'] = (1+df['IBRx100']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IGC_Ret'] = (1+df['IGC']).rolling(window=12).agg(lambda x : x.prod())-1

df_m[['R10_Ret', 'R15_Ret', 'R20_Ret', 'Long_Ret', 'IBOV_Ret', 'IDIV_Ret', 'IBRx100_Ret', 'IGC_Ret']].describe()

# Volatilidade

df_m['R10_vol'] = df_m['Ranking10'].rolling(window=12).std() * np.sqrt(12)
df_m['R15_vol'] = df_m['Ranking15'].rolling(window=12).std() * np.sqrt(12)
df_m['R20_vol'] = df_m['Ranking20'].rolling(window=12).std() * np.sqrt(12)
df_m['Long_vol'] = df_m['Long'].rolling(window=12).std() * np.sqrt(12)
df_m['IBOV_vol'] = df_m['IBOV'].rolling(window=12).std() * np.sqrt(12)
df_m['IDIV_vol'] = df_m['IDIV'].rolling(window=12).std() * np.sqrt(12)
df_m['IBRx100_vol'] = df_m['IBRx100'].rolling(window=12).std() * np.sqrt(12)
df_m['IGC_vol'] = df_m['IGC'].rolling(window=12).std() * np.sqrt(12)

df_m[['R10_vol', 'R15_vol', 'R20_vol', 'Long_vol', 'IBOV_vol', 'IDIV_vol', 'IBRx100_vol', 'IGC_vol']].describe()

# Tracking Error

df_m['R10_TE'] = df_m['R10_vol'] * (df_m['R10_Ret'] - df_m['IBOV_Ret'])
df_m['R15_TE'] = df_m['R15_vol'] * (df_m['R15_Ret'] - df_m['IBOV_Ret'])
df_m['R20_TE'] = df_m['R20_vol'] * (df_m['R20_Ret'] - df_m['IBOV_Ret'])
df_m['Long_TE'] = df_m['Long_vol'] * (df_m['Long_Ret'] - df_m['IBOV_Ret'])
df_m['IBOV_TE'] = df_m['IBOV_vol'] * (df_m['IBOV_Ret'] - df_m['IBOV_Ret'])
df_m['IDIV_TE'] = df_m['IDIV_vol'] * (df_m['IDIV_Ret'] - df_m['IBOV_Ret'])
df_m['IBRx100_TE'] = df_m['IBRx100_vol'] * (df_m['IBRx100_Ret'] - df_m['IBOV_Ret'])
df_m['IGC_TE'] = df_m['IGC_vol'] * (df_m['IGC_Ret'] - df_m['IBOV_Ret'])

df_m[['R10_TE', 'R15_TE', 'R20_TE', 'Long_TE', 'IBOV_TE', 'IDIV_TE', 'IBRx100_TE', 'IGC_TE']].describe()

# Desvio-padrão anualizado

df_m['R10_DP'] = df_m['R10_Ret'].rolling(window=12).std()
df_m['R15_DP'] = df_m['R15_Ret'].rolling(window=12).std()
df_m['R20_DP'] = df_m['R20_Ret'].rolling(window=12).std()
df_m['Long_DP'] = df_m['Long_Ret'].rolling(window=12).std()
df_m['IBOV_DP'] = df_m['IBOV_Ret'].rolling(window=12).std()
df_m['IDIV_DP'] = df_m['IDIV_Ret'].rolling(window=12).std()
df_m['IBRx100_DP'] = df_m['IBRx100_Ret'].rolling(window=12).std()
df_m['IGC_DP'] = df_m['IDIV_Ret'].rolling(window=12).std()

df_m[['R10_DP', 'R15_DP', 'R20_DP', 'Long_DP', 'IBOV_DP', 'IDIV_DP', 'IBRx100_DP', 'IGC_DP']].describe()

# Sharpe Ratio

df_m['CDI_Ret'] = (1+df_m['CDI']).rolling(window=12).agg(lambda x : x.prod())-1

df_m['R10_SR'] = (df_m['R10_Ret'] - df_m['CDI_Ret']) / df_m['R10_DP']
df_m['R15_SR'] = (df_m['R15_Ret'] - df_m['CDI_Ret']) / df_m['R15_DP']
df_m['R20_SR'] = (df_m['R20_Ret'] - df_m['CDI_Ret']) / df_m['R20_DP']
df_m['Long_SR'] = (df_m['Long_Ret'] - df_m['CDI_Ret']) / df_m['Long_DP']
df_m['IBOV_SR'] = (df_m['IBOV_Ret'] - df_m['CDI_Ret']) / df_m['IBOV_DP']
df_m['IDIV_SR'] = (df_m['IDIV_Ret'] - df_m['CDI_Ret']) / df_m['IDIV_DP']
df_m['IBRx100_SR'] = (df_m['IBRx100_Ret'] - df_m['CDI_Ret']) / df_m['IBRx100_DP']
df_m['IGC_SR'] = (df_m['IGC_Ret'] - df_m['CDI_Ret']) / df_m['IGC_DP']

df_m[['R10_SR', 'R15_SR', 'R20_SR', 'Long_SR', 'IBOV_SR', 'IDIV_SR','IBRx100_SR', 'IGC_SR']].describe()

df_m['Long_premium'] = df_m['Long'] - df_m['CDI']

def calcular_beta(data, retorno_carteira, retorno_mercado):
    Y = data[retorno_carteira]
    X = sm.add_constant(data[retorno_mercado])
    model = sm.OLS(Y, X).fit()
    return model.params[1]  # Retorna o coeficiente de retorno_mercado

def calcular_alpha_jensen(data, retorno_carteira, retorno_mercado, retorno_livre_risco):
    beta = calcular_beta(data, retorno_carteira, retorno_mercado)
    alpha = np.mean(data[retorno_carteira]) - (beta * np.mean(data[retorno_mercado]) + (1 - beta) * np.mean(data[retorno_livre_risco]))
    return alpha

# Variáveis de interesse e benchmarks
interest_vars = ['R10_Ret', 'R15_Ret', 'R20_Ret', 'Long_Ret', 'IDIV_Ret', 'IBRx100_Ret', 'IGC_Ret']
benchmarks = ['IBOV_Ret', 'IDIV_Ret', 'IBRx100_Ret', 'IGC_Ret']

# Preparando um DataFrame para armazenar os resultados
results = pd.DataFrame(columns=['Interest', 'Benchmark', 'Alpha'])

# Calcular Alpha de Jensen para cada combinação de interesse e benchmark
for interest in interest_vars:
    for benchmark in benchmarks:
        if interest != benchmark:  # Evitando calcular o alpha de uma variável com ela mesma
            # Preparar os dados para os cálculos
            data = df_m[[interest, benchmark, 'CDI_Ret']].dropna()

            # Calcular o alpha de Jensen
            alpha = calcular_alpha_jensen(data, interest, benchmark, 'CDI_Ret')

            # Adicionar ao DataFrame de resultados
            new_row = pd.DataFrame({
                'Interest': [interest],
                'Benchmark': [benchmark],
                'Alpha': [alpha]
            }, index=[0])
            results = pd.concat([results, new_row], ignore_index=True)

# Exibindo os resultados
print(results)

import pandas as pd
import scipy.stats as stats

# Variáveis de interesse e benchmarks
interest_vars = ['R10_Ret', 'R15_Ret', 'R20_Ret', 'Long_Ret']
benchmarks = ['IBOV_Ret', 'IDIV_Ret', 'IBRx100_Ret', 'IGC_Ret']

# Preparando um DataFrame para armazenar os resultados
results = pd.DataFrame(columns=['Interest', 'Benchmark', 'Mean Difference', 't-Statistic', 'p-Value'])

# Realizando o teste t para cada combinação de variável de interesse e benchmark
for interest in interest_vars:
    for benchmark in benchmarks:
        interest_sample = df_m[interest].dropna()
        benchmark_sample = df_m[benchmark].dropna()

        # Teste t para amostras independentes
        t_stat, p_val = stats.ttest_ind(interest_sample, benchmark_sample, equal_var=False)  # Welch's t-test
        mean_diff = interest_sample.mean() - benchmark_sample.mean()

        # Criando um novo DataFrame para a linha atual de resultados
        new_row = pd.DataFrame({
            'Interest': [interest],
            'Benchmark': [benchmark],
            'Mean Difference': [mean_diff],
            't-Statistic': [t_stat],
            'p-Value': [p_val]
        })

        # Concatenando o novo DataFrame com os resultados existentes
        results = pd.concat([results, new_row], ignore_index=True)

# Mostrando os resultados
print(results)

# Variáveis de interesse e benchmarks
interest_vars = ['R10_Ret', 'R15_Ret', 'R20_Ret', 'Long_Ret']
benchmarks = ['IBOV_Ret', 'IDIV_Ret', 'IBRx100_Ret', 'IGC_Ret']

# Preparando um DataFrame para armazenar os resultados
results = pd.DataFrame(columns=['Interest', 'Benchmark', 'Percentage Outperform'])

# Comparando cada variável de interesse com cada benchmark
for interest in interest_vars:
    for benchmark in benchmarks:
        # Conta quantas vezes a variável de interesse supera o benchmark
        outperform_count = (df_m[interest] > df_m[benchmark]).sum()
        total_observations = df_m[[interest, benchmark]].dropna().shape[0]  # Total de observações não-NaN comparáveis
        outperform_percentage = (outperform_count / total_observations) * 100 if total_observations != 0 else 0

        # Adicionando resultados ao DataFrame
        results = pd.concat([results, pd.DataFrame({
            'Interest': [interest],
            'Benchmark': [benchmark],
            'Percentage Outperform': [outperform_percentage]
        })], ignore_index=True)

# Mostrando os resultados
print(results)

# Compute the wealth index by starting with 100 dollars

df_m['IBOV_wealth_index'] = 100 * (1+df_m['IBOV']).cumprod() # The starting value won't matter with drawdowns
df_m['IBOV_previous_peaks'] = df_m['IBOV_wealth_index'].cummax() # Compute the previous peaks
df_m['IBOV_Drawdown'] = (df_m['IBOV_wealth_index'] - df_m['IBOV_previous_peaks'])/df_m['IBOV_previous_peaks'] # Calculate the drawdown in percentage

df_m['Rank_10_wealth_index'] = 100 * (1+df_m['Ranking10']).cumprod()
df_m['Rank_10_previous_peaks'] = df_m['Rank_10_wealth_index'].cummax()
df_m['Rank_10_Drawdown'] = (df_m['Rank_10_wealth_index'] - df_m['Rank_10_previous_peaks'])/df_m['Rank_10_previous_peaks']

df_m['Rank_15_wealth_index'] = 100 * (1+df_m['Ranking15']).cumprod()
df_m['Rank_15_previous_peaks'] = df_m['Rank_15_wealth_index'].cummax()
df_m['Rank_15_Drawdown'] = (df_m['Rank_15_wealth_index'] - df_m['Rank_15_previous_peaks'])/df_m['Rank_15_previous_peaks']

df_m['Rank_20_wealth_index'] = 100 * (1+df_m['Ranking20']).cumprod()
df_m['Rank_20_previous_peaks'] = df_m['Rank_20_wealth_index'].cummax()
df_m['Rank_20_Drawdown'] = (df_m['Rank_20_wealth_index'] - df_m['Rank_20_previous_peaks'])/df_m['Rank_20_previous_peaks']

df_m['Long_wealth_index'] = 100 * (1+df_m['Ranking20']).cumprod()
df_m['Long_previous_peaks'] = df_m['Long_wealth_index'].cummax()
df_m['Long_Drawdown'] = (df_m['Long_wealth_index'] - df_m['Long_previous_peaks'])/df_m['Long_previous_peaks']

df_m['IDIV_wealth_index'] = 100 * (1+df_m['IDIV']).cumprod()
df_m['IDIV_previous_peaks'] = df_m['IDIV_wealth_index'].cummax()
df_m['IDIV_Drawdown'] = (df_m['IDIV_wealth_index'] - df_m['IDIV_previous_peaks'])/df_m['IDIV_previous_peaks']

df_m['IBRx100_wealth_index'] = 100 * (1+df_m['IBRx100']).cumprod()
df_m['IBRx100_previous_peaks'] = df_m['IBRx100_wealth_index'].cummax()
df_m['IBRx100_Drawdown'] = (df_m['IBRx100_wealth_index'] - df_m['IBRx100_previous_peaks'])/df_m['IBRx100_previous_peaks']

df_m['IGC_wealth_index'] = 100 * (1+df_m['IGC']).cumprod()
df_m['IGC_previous_peaks'] = df_m['IGC_wealth_index'].cummax()
df_m['IGC_Drawdown'] = (df_m['IGC_wealth_index'] - df_m['IGC_previous_peaks'])/df_m['IGC_previous_peaks']


print(df_m[['IBOV_wealth_index', 'IBOV_previous_peaks', 'IBOV_Drawdown', 'Rank_15_wealth_index', 'Rank_15_previous_peaks', 'Rank_15_Drawdown']].head())

# Drawdown

df_m['Rank_10_Drawdown_12'] = (1+df_m['Rank_10_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['Rank_15_Drawdown_12'] = (1+df_m['Rank_15_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['Rank_20_Drawdown_12'] = (1+df_m['Rank_20_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['Long_Drawdown_12'] = (1+df_m['Long_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IBOV_Drawdown_12'] = (1+df_m['IBOV_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IDIV_Drawdown_12'] = (1+df_m['IDIV_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IBRx100_Drawdown_12'] = (1+df_m['IBRx100_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1
df_m['IGC_Drawdown_12'] = (1+df_m['IGC_Drawdown']).rolling(window=12).agg(lambda x : x.prod())-1

df_m[['Rank_10_Drawdown_12', 'Rank_15_Drawdown_12','Rank_20_Drawdown_12', 'Long_Drawdown_12','IBOV_Drawdown_12','IDIV_Drawdown_12','IBRx100_Drawdown_12','IGC_Drawdown_12',]].describe()

fig, ax = plt.subplots(2,1, figsize=(20,10), gridspec_kw={'height_ratios': [2, 1]})

# Definindo diferentes estilos de linha
linestyles = ['-', '--', '-.', ':', (0, (3, 1, 1, 1)), (0, (5, 10))]
colors = ['black', 'gray', 'darkgray', 'lightgray', 'gainsboro', 'silver']  # Tons de cinza

ax[0].plot((1+df_m['Ranking15']).cumprod()-1, label='Rank15', linestyle=linestyles[0], color=colors[0])
ax[0].plot((1+df_m['IBOV']).cumprod()-1, label='IBOV', linestyle=linestyles[0], color=colors[5])
ax[0].plot((1+df_m['IDIV']).cumprod()-1, label='IDIV', linestyle=linestyles[1], color=colors[0])
ax[0].set_title("Cumulative Return")
ax[0].set_ylabel('Portfolio Return')
ax[0].legend(loc='upper left')

vals = ax[0].get_yticks()
ax[0].set_yticklabels(['{:,.0%}'.format(x) for x in vals])

ax[1].plot(df_m['Rank_15_Drawdown'], label='Rank15', linestyle=linestyles[0], color=colors[0])
ax[1].plot(df_m['IBOV_Drawdown'], label='IBOV', linestyle=linestyles[0], color=colors[5])
ax[1].plot(df_m['IDIV_Drawdown'], label='IDIV', linestyle=linestyles[1], color=colors[0])
ax[1].set_ylabel('Drawdown')
ax[1].legend(loc='lower left')

plt.gca().set_yticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_yticks()])

sns.set()

"""#Regression"""

# Criar a variável dummy para os anos de pandemia
df2['pandemia'] = df2.index.year.isin([2020, 2021]).astype(int)

# Verificar os primeiros valores para confirmar que a variável dummy foi criada corretamente
print(df2['pandemia'].head())

# Opcionalmente, visualizar a distribuição da variável dummy
print(df2['pandemia'].value_counts())

df2[['IBOV', 'IBOV_premium', 'Rank10', '10_premium', 'Rank15', '15_premium', 'Rank20', '20_premium', 'Long', 'BSD_premium', 'LS']].describe()

# Lista das variáveis específicas para as quais você deseja calcular o FIV
selected_columns = ['IBOV_premium', 'Long_premium', 'SMB', 'HML', 'WML', 'IML']  # Substitua 'var1', 'var2', 'var3' pelas colunas de interesse

# Selecionando apenas as colunas desejadas do DataFrame
x_subset = df2[selected_columns]

# Adicionando o termo constante ao subconjunto de variáveis
x_subset = sm.add_constant(x_subset)

# Função para calcular o FIV
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculando FIV
vif_data = calculate_vif(x_subset)
print("VIF for each variable in df_correl:\n", vif_data)

# Definindo as variáveis
y1 = df2['Rank10']
y2 = df2['Rank15']
y3 = df2['Rank20']
y4 = df2['Long']
x = df2[['MKT-RF', 'HML', 'SMB', 'WML', 'IML']]

# Adicionando o termo constante
x = sm.add_constant(x)

# Função para calcular o FIV
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculando FIV
vif_data = calculate_vif(x)
print("VIF for each variable:\n", vif_data)

# Executando as regressões
result1 = sm.OLS(y1, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result2 = sm.OLS(y2, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result3 = sm.OLS(y3, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result4 = sm.OLS(y4, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})

# Imprimindo a tabela resumo
print(result1.summary())
print("≡"*90 + "\n")
print(result2.summary())
print("≡"*90 + "\n")
print(result3.summary())
print("≡"*90 + "\n")
print(result4.summary())

import statsmodels.api as sm
import pandas as pd

# Definindo as variáveis
y1 = df2['Rank10']
y2 = df2['Rank15']
y3 = df2['Rank20']
y4 = df2['Long']
x = df2[['MKT-RF', 'HML', 'SMB', 'WML', 'IML']]

# Adicionando a variável dummy ao DataFrame de variáveis explicativas
x['pandemia'] = df2['pandemia']

# Criando interações entre a dummy e as outras variáveis
for var in ['MKT-RF', 'HML', 'SMB', 'WML', 'IML']:
    x[f'{var}*pandemia'] = x[var] * x['pandemia']

# Adicionando o termo constante após criar as interações
x = sm.add_constant(x)

# Visualizando as primeiras linhas para verificar as novas colunas
print(x.head())

# Função para calcular o FIV
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculando FIV
vif_data = calculate_vif(x)
print("VIF for each variable:\n", vif_data)

# Executando as regressões
result1 = sm.OLS(y1, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result2 = sm.OLS(y2, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result3 = sm.OLS(y3, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result4 = sm.OLS(y4, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})

# Imprimindo a tabela resumo
print(result1.summary())
print("≡"*90 + "\n")
print(result2.summary())
print("≡"*90 + "\n")
print(result3.summary())
print("≡"*90 + "\n")
print(result4.summary())

# Definindo as variáveis
y1 = df2['IBOV_premium']
y2 = df2['IBRX_premium']
y3 = df2['IDIV_premium']
y4 = df2['IGC_premium']
x = df2[['BSD_premium', 'HML', 'SMB', 'WML', 'IML']]

# Adicionando o termo constante
x = sm.add_constant(x)

# Função para calcular o FIV
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculando FIV
vif_data = calculate_vif(x)
print("VIF for each variable:\n", vif_data)

# Executando as regressões
result1 = sm.OLS(y1, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result2 = sm.OLS(y2, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result3 = sm.OLS(y3, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result4 = sm.OLS(y4, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})

# Imprimindo a tabela resumo
print(result1.summary())
print("≡"*90 + "\n")
print(result2.summary())
print("≡"*90 + "\n")
print(result3.summary())
print("≡"*90 + "\n")
print(result4.summary())

# Definindo as variáveis
y1 = df2['IBOV_premium']
y2 = df2['IBRX_premium']
y3 = df2['IDIV_premium']
y4 = df2['IGC_premium']
x = df2[['BSD_premium', 'HML', 'SMB', 'WML', 'IML']]

# Adicionando a variável dummy ao DataFrame de variáveis explicativas
x['pandemia'] = df2['pandemia']

# Criando interações entre a dummy e as outras variáveis
for var in ['BSD_premium', 'HML', 'SMB', 'WML', 'IML']:
    x[f'{var}*pandemia'] = x[var] * x['pandemia']

# Adicionando o termo constante após criar as interações
x = sm.add_constant(x)

# Visualizando as primeiras linhas para verificar as novas colunas
print(x.head())

# Função para calcular o FIV
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculando FIV
vif_data = calculate_vif(x)
print("VIF for each variable:\n", vif_data)

# Executando as regressões
result1 = sm.OLS(y1, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result2 = sm.OLS(y2, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result3 = sm.OLS(y3, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})
result4 = sm.OLS(y4, x, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags':1})

# Imprimindo a tabela resumo
print(result1.summary())
print("≡"*90 + "\n")
print(result2.summary())
print("≡"*90 + "\n")
print(result3.summary())
print("≡"*90 + "\n")
print(result4.summary())

